library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
all_counts <- readRDS(file= "Data/pc_count_matrix.rds")
avg_counts <- readRDS(file="Data/avg_pc_count_matrix.rds")
diff_counts <- readRDS(file="Data/diff_pc_count_matrix.rds")
meta <- suppressMessages(readr::read_tsv("Data/complete_meta_data.tsv"))
TFs<- suppressMessages(read_delim("mouse_tfs.tsv")) 
all_counts <- all_counts[rownames(all_counts) %in% (TFs %>% pull(Symbol)),]
avg_counts <- avg_counts[rownames(avg_counts) %in% (TFs %>% pull(Symbol)),]
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="embryonic facial prominence", "facial prominence"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="skeletal muscle tissue", "muscle tissue"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="skeletal muscle tissue", "muscle tissue"))
tissue_types <- unique(meta %>% pull(tissue_type))
collapsed_dev_counts <- matrix(data=NA, nrow=nrow(all_counts),ncol=length(tissue_types))
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  tissue_subset_count <- all_counts[,tissue_subset_ids]
  tissue_avg <- rowMeans(tissue_subset_count)
  collapsed_dev_counts[,walker] <- tissue_avg
  walker <- walker + 1
}
colnames(collapsed_dev_counts) <- tissue_types
rownames(collapsed_dev_counts) <- rownames(all_counts)

Investigate the most highly expressed gene (sum of all stages) for each tissue type

maxes_list <- list()
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_sums <- apply(all_counts[,tissue_subset_ids],1, sum)
  maxes_list[tissue] <- names(all_sums)[which.max(all_sums)]
}
for (name in names(maxes_list)){
  print(paste0("The most highly expressed protein in ", name, " : ", maxes_list[name]))
}
## [1] "The most highly expressed protein in forebrain : Hmgb1"
## [1] "The most highly expressed protein in midbrain : Hmgb1"
## [1] "The most highly expressed protein in hindbrain : Ybx1"
## [1] "The most highly expressed protein in facial prominence : Hmgb1"
## [1] "The most highly expressed protein in heart : Cnbp"
## [1] "The most highly expressed protein in limb : Hmgb1"
## [1] "The most highly expressed protein in liver : Cnbp"
## [1] "The most highly expressed protein in neural tube : Ybx1"
## [1] "The most highly expressed protein in lung : Hmgb1"
## [1] "The most highly expressed protein in stomach : Hmgb1"
## [1] "The most highly expressed protein in intestine : Hmgb1"
## [1] "The most highly expressed protein in kidney : Hmgb1"
## [1] "The most highly expressed protein in thymus : Hmgb1"
## [1] "The most highly expressed protein in muscle tissue : Cnbp"
## [1] "The most highly expressed protein in spleen : Hmgb1"
## [1] "The most highly expressed protein in urinary bladder : Cnbp"
## [1] "The most highly expressed protein in adrenal gland : Cnbp"

Same thing as above but excluding the mitochondria

maxes_list <- list()
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_sums <- apply(all_counts[,tissue_subset_ids],1, sum)
  all_sums <- all_sums[!sapply(names(all_sums), startsWith, "mt-")]
  maxes_list[tissue] <- names(all_sums)[which.max(all_sums)]
}
for (name in names(maxes_list)){
  print(paste0("The most highly expressed protein in ", name, " : ", maxes_list[name]))
}
## [1] "The most highly expressed protein in forebrain : Hmgb1"
## [1] "The most highly expressed protein in midbrain : Hmgb1"
## [1] "The most highly expressed protein in hindbrain : Ybx1"
## [1] "The most highly expressed protein in facial prominence : Hmgb1"
## [1] "The most highly expressed protein in heart : Cnbp"
## [1] "The most highly expressed protein in limb : Hmgb1"
## [1] "The most highly expressed protein in liver : Cnbp"
## [1] "The most highly expressed protein in neural tube : Ybx1"
## [1] "The most highly expressed protein in lung : Hmgb1"
## [1] "The most highly expressed protein in stomach : Hmgb1"
## [1] "The most highly expressed protein in intestine : Hmgb1"
## [1] "The most highly expressed protein in kidney : Hmgb1"
## [1] "The most highly expressed protein in thymus : Hmgb1"
## [1] "The most highly expressed protein in muscle tissue : Cnbp"
## [1] "The most highly expressed protein in spleen : Hmgb1"
## [1] "The most highly expressed protein in urinary bladder : Cnbp"
## [1] "The most highly expressed protein in adrenal gland : Cnbp"

Top 5 Universally highly expressed protein across all tissues regardless of stages (without processing)

##Collapse dev_stages
collapsed_matrices <- matrix(data=NA, nrow = nrow(all_counts), ncol = length(tissue_types))
rownames(collapsed_matrices) <- rownames(all_counts)
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_avgs <- apply(all_counts[,tissue_subset_ids],1, mean)
  collapsed_matrices[,walker] <- all_avgs
  walker <- walker + 1
}
all_sums <- apply(collapsed_matrices,1, sum)
for (i in 1:10){
  print(paste0("The highests expressed protein acrossed all tissue in order ",i ,": ", names(sort(all_sums,decreasing=TRUE))[i]))
}
## [1] "The highests expressed protein acrossed all tissue in order 1: Hmgb1"
## [1] "The highests expressed protein acrossed all tissue in order 2: Cnbp"
## [1] "The highests expressed protein acrossed all tissue in order 3: Ybx1"
## [1] "The highests expressed protein acrossed all tissue in order 4: Sub1"
## [1] "The highests expressed protein acrossed all tissue in order 5: Tsc22d1"
## [1] "The highests expressed protein acrossed all tissue in order 6: Id2"
## [1] "The highests expressed protein acrossed all tissue in order 7: Id3"
## [1] "The highests expressed protein acrossed all tissue in order 8: Hmgb3"
## [1] "The highests expressed protein acrossed all tissue in order 9: Hmgb2"
## [1] "The highests expressed protein acrossed all tissue in order 10: Nme2"

Expression profile of the highest expressed protein across all tissues Hba-a2 Extract only Hba-a2 information

library(ggplot2)
target_protein <- "Hmgb1"
unique_dev_stages <- unique(meta %>% pull(dev_stage))
Hba_matrix <- matrix(data=NA, nrow=length(tissue_types), ncol=3)
colnames(Hba_matrix) <- c("count", "dev_stage", "tissue_type")
Hba_frame <- data.frame(Hba_matrix)
walker <- 1
for (exp_id in names(avg_counts[target_protein,])){
    Hba_frame[walker, "count"] <- avg_counts[target_protein, exp_id]
    Hba_frame[walker, "dev_stage"] <- meta %>% filter(id==exp_id) %>% pull(dev_stage)
    Hba_frame[walker, "tissue_type"] <- meta %>% filter(id==exp_id) %>% pull(tissue_type)
    walker <- walker + 1
}
Hba_frame$count <- log2(Hba_frame$count) 

Plot the extracted information

library(ggrepel)
make_label <- function(row){
  if (row['dev_stage'] == max(Hba_frame %>% filter(tissue_type==row['tissue_type']) %>% pull(dev_stage))){
    #print(max(Hba_frame %>% filter(tissue_type==tissue) %>% pull(dev_stage)))\
     return(as.character(row['tissue_type']))
  }else {

    return(NA_character_)
  }
}
Hba_frame[,"label"] <- Hba_frame %>% apply(1,make_label)
# Hba_frame <- Hba_frame %>%
#   mutate(label = if_else(dev_stage == max(Hba_frame %>% filter()), as.character(tissue_type), NA_character_))
ggplot(Hba_frame, aes(x=dev_stage, y=count, group=tissue_type, colour=tissue_type)) + geom_line() + geom_point() + expand_limits(x= c(4, 10)) +
  # geom_text_repel(
  #   aes(label = gsub("^.*$", " ", label)),
  #   # This will force the correct position of the link's right end.   

  #   segment.curvature = -0.1,
  #   segment.square = TRUE,
  #   segment.color = 'grey',
  #   box.padding = 0.1,
  #   point.padding = 0.6,
  #   nudge_x = 1,
  #   nudge_y = 1,
  #   force = 15,
  #   hjust = 0,
  #   direction = "y",
  #   na.rm = TRUE,
  #   xlim = c(5, 10),
  #   ylim = c(0, 150000),
  # ) +
  geom_text_repel(data = . %>% filter(!is.na(label)),
                  aes(label = paste0("  ", label)),
                  #segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1,
                  nudge_y = 1,
                 force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE, 
                  xlim = c(5, 10),
                  ylim = c(10,17),
)                   +
  ylab("counts(log2)") + ggtitle("Expression pattern of Hba-a2 across tissues") + theme(plot.title = element_text(hjust = 0.5))

Top 5 Universally highly expressed protein across all tissues regardless of stages (by percentage in tissues)

##Collapse dev_stages
collapsed_matrices <- matrix(data=NA, nrow = nrow(all_counts), ncol = length(tissue_types))
rownames(collapsed_matrices) <- rownames(all_counts)
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_avgs <- apply(all_counts[,tissue_subset_ids],1, mean)
  percent_counts <- all_avgs/sum(all_avgs)
  collapsed_matrices[,walker] <- percent_counts
  walker <- walker + 1
}
all_sums <- apply(collapsed_matrices,1, sum)
for (i in 1:10){
  print(paste0("The highests expressed protein acrossed all tissue in order ",i ,": ", names(sort(all_sums,decreasing=TRUE))[i]))
}
## [1] "The highests expressed protein acrossed all tissue in order 1: Hmgb1"
## [1] "The highests expressed protein acrossed all tissue in order 2: Cnbp"
## [1] "The highests expressed protein acrossed all tissue in order 3: Sub1"
## [1] "The highests expressed protein acrossed all tissue in order 4: Ybx1"
## [1] "The highests expressed protein acrossed all tissue in order 5: Tsc22d1"
## [1] "The highests expressed protein acrossed all tissue in order 6: Id2"
## [1] "The highests expressed protein acrossed all tissue in order 7: Id3"
## [1] "The highests expressed protein acrossed all tissue in order 8: Hmgb2"
## [1] "The highests expressed protein acrossed all tissue in order 9: Nme2"
## [1] "The highests expressed protein acrossed all tissue in order 10: Hmgb3"

Top 5 Universally highly expressed protein across all tissues regardless of stages (by ranking)

##Collapse dev_stages
collapsed_matrices <- matrix(data=NA, nrow = nrow(all_counts), ncol = length(tissue_types))
rownames(collapsed_matrices) <- rownames(all_counts)
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_avgs <- apply(all_counts[,tissue_subset_ids],1, mean)
  all_ranking <- rank(all_avgs)
  collapsed_matrices[,walker] <- all_ranking
  walker <- walker + 1
}
all_sums <- apply(collapsed_matrices,1, sum)
for (i in 1:10){
  print(paste0("The highests expressed protein acrossed all tissue in order ",i ,": ", names(sort(all_sums, decreasing = TRUE))[i]))
}
## [1] "The highests expressed protein acrossed all tissue in order 1: Hmgb1"
## [1] "The highests expressed protein acrossed all tissue in order 2: Cnbp"
## [1] "The highests expressed protein acrossed all tissue in order 3: Sub1"
## [1] "The highests expressed protein acrossed all tissue in order 4: Ybx1"
## [1] "The highests expressed protein acrossed all tissue in order 5: Tsc22d1"
## [1] "The highests expressed protein acrossed all tissue in order 6: Id3"
## [1] "The highests expressed protein acrossed all tissue in order 7: Id2"
## [1] "The highests expressed protein acrossed all tissue in order 8: Hmgb2"
## [1] "The highests expressed protein acrossed all tissue in order 9: Csde1"
## [1] "The highests expressed protein acrossed all tissue in order 10: Phb"

expression distribution acrossed different tissues (box plot)

library(cowplot)
all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  ggplot(data=all_counts_melted, aes(x=dev_stage, y=log2(count+1))) + geom_boxplot() +  facet_wrap(~ tissue_type, ncol=3) + theme_cowplot(12) + geom_hline(yintercept=median(log2(all_counts+1)), colour="red")

expression distribution acrossed different tissues (histogram)

all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  suppressWarnings(ggplot(data=all_counts_melted, aes(x=log2(count+1), fill=dev_stage)) + geom_histogram(binwidth = 0.5) +  facet_wrap(~ tissue_type, ncol=3) + theme_cowplot(12))

expression distribution acrossed different tissues (histogram log transformed)

all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  suppressWarnings(ggplot(data=all_counts_melted, aes(x=log2(count+1), y=..density.. ,fill=dev_stage)) + geom_histogram(binwidth = 0.5) +  facet_wrap(~ tissue_type, ncol=3) + theme_cowplot(12))

expression distribution acrossed different tissues (box plot collapsed dev_stages)

melted_collapsed_dev <- pivot_longer(data.frame(collapsed_dev_counts), cols=(everything()), names_to = "tissue_type", values_to = "count")
ggplot(data=melted_collapsed_dev, aes(y=log2(count+1), x=tissue_type)) + geom_boxplot() + theme_cowplot(12) + theme(axis.text.x = element_text(angle=45, hjust=1, size=15)) + xlab("tissue types") + geom_hline(yintercept=median(log2(all_counts+1)), colour="red") +  theme(text = element_text(size=20))

expression distribution acrossed different tissues (histogram collapsed by dev_stage)

  suppressWarnings(ggplot(data=melted_collapsed_dev, aes(x=log2(count+1), fill=tissue_type, y=..density..)) + geom_histogram(binwidth = 0.5) + theme_cowplot(12))  

expression distribution acrossed different tissues (freq_plot collapsed by dev_stage)

  ggplot(data=melted_collapsed_dev, aes(x=log2(count+1), y=..density.., colour=tissue_type)) + geom_freqpoly(binwidth=0.5) + theme_cowplot(12) + scale_x_continuous(limits=c(0, 20))

Mean-variance (collpased all samples)

library(ggrepel)
all_counts_mean_var <- all_counts
all_counts_mean_var <- cbind(all_counts_mean_var, mean=apply(all_counts,1,mean))
all_counts_mean_var <- cbind(all_counts_mean_var, variance=apply(all_counts,1,var))
ggplot(data=data.frame(all_counts_mean_var), aes(x=log2(mean+1), y=log2(variance+1))) + geom_point() + theme_cowplot(12) + 
  geom_text_repel(data=data.frame(all_counts_mean_var) %>% rownames_to_column(var="rowname") %>% slice_max(variance, n=10), aes(x=log2(mean+1), y=log2(variance), label=rowname, colour="red")) +
  geom_text_repel(data=data.frame(all_counts_mean_var) %>% rownames_to_column(var="rowname") %>% slice_max(mean, n=10), aes(x=log2(mean+1), y=log2(variance), label=rowname, colour="blue")) + scale_colour_discrete(name = "Top elements", labels = c("Variance", "Mean"))

Mean and variance all together but hue by organism type (collapsed by tissue type)

collapsed_matrices <- data.frame(mean=double(), variance=double(), tissue=character(), row=character())
colnames(collapsed_matrices) <- c("mean", "variance", "tissue")
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_means <- apply(all_counts[,tissue_subset_ids],1, mean)
  all_variance <- apply(all_counts[,tissue_subset_ids],1, var)
  temp_matrix <- data.frame(mean=all_means, variance=all_variance, tissue) %>% rownames_to_column("rowname")
  collapsed_matrices <- rbind(collapsed_matrices, temp_matrix)
  
}
c25 <- c(
  "dodgerblue2", "#E31A1C", # red
  "green4",
  "#6A3D9A", # purple
  "#FF7F00", # orange
  "black", "gold1",
  "skyblue2", "#FB9A99", # lt pink
  "palegreen2",
  "#CAB2D6", # lt purple
  "#FDBF6F", # lt orange
  "gray70", "khaki2",
  "maroon", "orchid1", "deeppink1", "blue1", "steelblue4",
  "darkturquoise", "green1", "yellow4", "yellow3",
  "darkorange4", "brown"
)

ggplot(data=collapsed_matrices, aes(x=log2(mean+1), y=log2(variance+1), colour=tissue)) + geom_point()+ theme_cowplot(12) + 
  geom_text_repel(data=collapsed_matrices %>% slice_max(variance+mean, n=15), aes(x=log2(mean+1), y=log2(variance+1), label=rowname, colour=tissue, shape=tissue)) + scale_color_manual(values = c25) + theme(legend.key.size = unit(1, 'cm'),  legend.key.height = unit(1, 'cm'), legend.key.width = unit(1, 'cm'),  text = element_text(size=20))

Variability across tissue types

ggplot(data=collapsed_matrices, aes(x=log2(mean+1), y=log2(variance+1))) + geom_point() +  facet_wrap(~ tissue, ncol=3) + theme_cowplot(12) 

cluster correlation heatmap

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(RColorBrewer)
# Get some nicer colours
mypalette <- brewer.pal(11, "RdYlBu")
# http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
morecols <- colorRampPalette(mypalette)
# Set up colour vector for celltype variable
col.cell <- (c("purple","orange")[meta$tissue_type])[1:(length(meta$tissue_type)/2)]
# Plot the 
avg_counts_tissue_name <- avg_counts
colnames(avg_counts_tissue_name) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
heatmap.2(log2(avg_counts_tissue_name+1)[(order(apply(avg_counts,1,var), decreasing = TRUE))[1:50],], 
          col=rev(morecols(50)),
          trace="column", 
          main="Top 50 most variable genes across samples",
          ColSideColors=col.cell,
          scale = "row")

cluster correlation heatmap (only TFs)

library(gplots)
library(RColorBrewer)
# Get some nicer colours
mypalette <- brewer.pal(11, "RdYlBu")
# http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
morecols <- colorRampPalette(mypalette)
# Set up colour vector for celltype variable
col.cell <- (c("purple","orange")[meta$tissue_type])[1:(length(meta$tissue_type)/2)]
# Plot the heatmap
avg_counts_tissue_name <- avg_counts
colnames(avg_counts_tissue_name) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
heatmap.2(log2(avg_counts_tissue_name+1)[(order(apply(avg_counts,1,var), decreasing = TRUE))[1:10],], 
          col=rev(morecols(length(TFs))),
          trace="column", 
          main="Top 10 most variable genes across samples",
          ColSideColors=col.cell,
          scale="row")

Midbrain cluster correlation heatmap (only TFs)

# Set up colour vector for celltype variable
col.cell <- (c("purple","orange")[meta$tissue_type])[1:(length(meta %>% filter(tissue_type=="midbrain") %>% pull(id)))]
# Plot the heatmap
midbrain_id <- meta %>% filter(tissue_type == "midbrain") %>% pull(id)
midbrain_counts <- all_counts[,midbrain_id]
colnames(midbrain_counts) <- sapply(colnames(midbrain_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(dev_stage)))})
heatmap.2(log2(midbrain_counts+1)[(order(apply(midbrain_counts,1,var), decreasing = TRUE))[1:10],], 
          col=rev(morecols(length(TFs))),
          trace="column", 
          main="TOP 10 variable genes in midbrain",
          ColSideColors=col.cell,
          scale="row")

Hindbrain cluster correlation heatmap (only TFs) find some individual genes and describe

# Set up colour vector for celltype variable
col.cell <- (c("purple","orange")[meta$tissue_type])[1:(length(meta %>% filter(tissue_type=="hindbrain") %>% pull(id)))]
# Plot the heatmap
hindbrain_id <- meta %>% filter(tissue_type == "hindbrain") %>% pull(id)
hindbrain_counts <- all_counts[,hindbrain_id]
colnames(hindbrain_counts) <- sapply(colnames(hindbrain_counts), function(x){temp_row <- meta %>% filter(id==x) 
                                                                             return(paste0(temp_row %>% pull(dev_stage)))})
heatmap.2(log2(hindbrain_counts+1)[(order(apply(hindbrain_counts,1,var), decreasing = TRUE))[1:10],], 
          col=rev(morecols(length(TFs))),
          trace="column", 
          main="Top 10 variable genes in hindbrain",
          ColSideColors=col.cell,
          scale="row")

ggplot(data=collapsed_matrices, aes(x=log2(mean+1), y=log2(variance+1))) + geom_point() +  facet_wrap(~ tissue, ncol=3) + theme_cowplot(12) 

TFCounts.avg <- data.frame(avg_counts) %>% rownames_to_column("rowname")
TFCounts.avg <- TFCounts.avg %>% gather("id", "counts", -rowname)
TFCounts.avg <- TFCounts.avg %>% left_join((meta %>% select(tissue_type, dev_stage, id)), by="id")
TFCounts.Hmgb1 <- TFCounts.avg %>% filter(rowname == "Hmgb1")

ggplot(data=TFCounts.Hmgb1, aes(y=log2(counts+1), x=dev_stage, group=tissue_type, colour=tissue_type)) + geom_line() + geom_point() + theme(axis.text.x = element_text(size = , angle = 45, hjust = 1)) + theme_cowplot() + scale_color_manual(values = c25) + theme(text = element_text(size=20), legend.key.height = unit(1, 'cm'))